Explainable Boosting Machine (EBM) Model

This notebook shows an example of how to use the APIs with the ExplainableBoostingMachine model from the interpret package, and how to obtain a factual explanation of the model's predictions in the form of feature importances.

[1]:
# Load and join raw data sources and their metadata.
# Executes the companion notebook, which defines `df_all` (the joined DataFrame)
# and `md_all` (the per-column metadata dict) used throughout this notebook.
%run Example_InputDataSources.ipynb
[2]:
# Joined DataFrame: candidate columns are suffixed `_c`, job columns `_j`;
# show the first rows for a quick sanity check.
df_all.head()
[2]:
id_c education_background_c professional_experience_c skills_c gender_c agg_perceived_foreign_c id_j education_reqs_j experience_reqs_role_j experience_reqs_duration_j skills_j gender_j agg_perceived_foreign_j ranking shortlisted score
0 5 [{'institution': 'Complutense University Of Ma... [{'institution': 'Stylo Milano', 'start_date':... [Communications, Social Integration, Microsoft... Man No 5 [Law Bachelor, Degree In Law, Higher Degree In... [Consultant] 12 [Punctuality, Organization, Accounting, Englis... Man No 4 1 0.0
1 6 [{'institution': 'Coronel Rosales Agricultural... [{'institution': 'Securitas Direct', 'start_da... [Refinancing, Economy, Microsoft Excel, Collec... Man No 3 [] [Sales Assistant, Saleswoman, Commercial Advisor] 12 [English, Spanish, Communications, Communicati... Man No 8 1 0.6
2 10 [{'institution': 'Complutense University Of Ma... [{'institution': 'Carrefour Express', 'start_d... [Entrepreneurship, Literacy, Web Design, Adobe... Woman No 5 [Law Bachelor, Degree In Law, Higher Degree In... [Consultant] 12 [Punctuality, Organization, Accounting, Englis... Man No 4 1 0.0
3 11 [{'institution': 'Les Ribera De Los Molinos', ... [{'institution': 'Decimas Sl', 'start_date': '... [Consulting, Sap Crm, Collections, Automation,... Woman No 3 [] [Sales Assistant, Saleswoman, Commercial Advisor] 12 [English, Spanish, Communications, Communicati... Man No 12 0 0.4
4 15 [{'institution': 'Escuela Politcnica Superior ... [{'institution': 'Reintegrate', 'start_date': ... [Microsoft Word, Biofuels, English, Entreprene... Man No 3 [] [Sales Assistant, Saleswoman, Commercial Advisor] 12 [English, Spanish, Communications, Communicati... Man No 5 1 0.7
[3]:
# Joined metadata: for each column, its JSON schema, attribute type,
# attribute usage (default / sensitive / target), and knowledge base.
md_all
[3]:
{'id_c':
        SCHEMA = {'type': 'number'}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'education_background_c':
        SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'degree': {'type': 'string'}, 'duration': {'type': 'string'}}}}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'professional_experience_c':
        SCHEMA = {'type': 'array', 'items': {'type': 'object', 'properties': {'institution': {'type': 'string'}, 'end_date': {'type': 'string'}, 'role': {'type': 'string'}, 'duration': {'type': 'string'}}}}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'skills_c':
        SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'gender_c':
        SCHEMA = {'enum': ['Man', 'Woman', 'Any']}
        ATTR_TYPE = category
        ATTR_USAGE = sensitive
        KNOWLEDGE_BASE = None,
 'agg_perceived_foreign_c':
        SCHEMA = {'enum': ['No', 'Yes', 'Any']}
        ATTR_TYPE = category
        ATTR_USAGE = sensitive
        KNOWLEDGE_BASE = None,
 'id_j':
        SCHEMA = {'type': 'number'}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'education_reqs_j':
        SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'experience_reqs_role_j':
        SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'experience_reqs_duration_j':
        SCHEMA = {'type': 'number'}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'skills_j':
        SCHEMA = {'type': 'array', 'items': {'type': 'string'}}
        ATTR_TYPE = object
        ATTR_USAGE = default
        KNOWLEDGE_BASE = None,
 'gender_j':
        SCHEMA = {'enum': ['Man', 'Woman', 'Any']}
        ATTR_TYPE = category
        ATTR_USAGE = sensitive
        KNOWLEDGE_BASE = None,
 'agg_perceived_foreign_j':
        SCHEMA = {'enum': ['No', 'Yes', 'Any']}
        ATTR_TYPE = category
        ATTR_USAGE = sensitive
        KNOWLEDGE_BASE = None,
 'score':
        SCHEMA = {'type': 'number'}
        ATTR_TYPE = numeric
        ATTR_USAGE = target
        KNOWLEDGE_BASE = None,
 'ranking':
        SCHEMA = {'type': 'integer'}
        ATTR_TYPE = ordinal
        ATTR_USAGE = target
        KNOWLEDGE_BASE = None,
 'shortlisted':
        SCHEMA = {'type': 'integer'}
        ATTR_TYPE = category
        ATTR_USAGE = target
        KNOWLEDGE_BASE = None}
[4]:
# Importing libraries to avoid warnings at running time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
[5]:
from sklearn import set_config
# Make scikit-learn transformers return pandas DataFrames instead of numpy
# arrays, so column names survive through the preprocessing steps.
set_config(transform_output = "pandas")
[6]:
# Cast every column whose metadata marks it as 'category' to the pandas
# categorical dtype, then inspect the resulting schema.
cat_cols = [col for col, meta in md_all.items() if meta.attr_type == 'category']
df_all[cat_cols] = df_all[cat_cols].astype('category')
# DataFrame summary: dtypes, non-null counts, memory usage.
df_all.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1643 entries, 0 to 1642
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   id_c                        1643 non-null   int64
 1   education_background_c      1643 non-null   object
 2   professional_experience_c   1643 non-null   object
 3   skills_c                    1643 non-null   object
 4   gender_c                    1643 non-null   category
 5   agg_perceived_foreign_c     1643 non-null   category
 6   id_j                        1643 non-null   int64
 7   education_reqs_j            1643 non-null   object
 8   experience_reqs_role_j      1643 non-null   object
 9   experience_reqs_duration_j  1643 non-null   int64
 10  skills_j                    1643 non-null   object
 11  gender_j                    1643 non-null   category
 12  agg_perceived_foreign_j     1643 non-null   category
 13  ranking                     1643 non-null   int64
 14  shortlisted                 1643 non-null   category
 15  score                       1643 non-null   float64
dtypes: category(5), float64(1), int64(4), object(6)
memory usage: 149.9+ KB
[7]:
# Define ids, target feature(s), and predictive features.
# Predictors are every column that is neither an identifier nor a target
# (Index.difference performs a set difference and returns a sorted Index).
id_cols = ['id_j', 'id_c']
target_cols = ['score', 'ranking', 'shortlisted']
pred_cols = df_all.columns.difference([*id_cols, *target_cols])
[8]:
from findhr.preprocess.example_mappings import RelevantExperienceForRole, ExtractMonthDurationJob, MatchOrdinal, \
    ExtractListOfProperty, MatchFeatureAtLeastInList, MatchFeatureSet, MatchBinary

# Calculated features.
# Each mapping key is a pair of tuples: (input column names, output column names);
# the value is the transformation object that derives the outputs from the inputs.
maps_derived_1 = {
    (('professional_experience_c', 'experience_reqs_role_j',), ('relevant_exp_role_c',)): RelevantExperienceForRole(),
}

# Second derivation stage: consumes 'relevant_exp_role_c' produced by maps_derived_1.
maps_derived_2 = {
    (('relevant_exp_role_c',), ('role_duration_months_c',)): ExtractMonthDurationJob(duration_key='duration_months'),
    (('education_background_c',), ('degree_list_c',)): ExtractListOfProperty(property_key='degree')
}

# Fitness features about the matching between candidate's features and job's requirements.
maps_matching = {
    (('experience_reqs_duration_j', 'role_duration_months_c'), ('fitness_experience',)): MatchOrdinal(),
    (('education_reqs_j', 'education_background_c'), ('fitness_education',)): MatchFeatureAtLeastInList(),
    (('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(),
    (('gender_j', 'gender_c'), ('fitness_gender',)): MatchBinary(),
    (('agg_perceived_foreign_j', 'agg_perceived_foreign_c'), ('fitness_foreign',)): MatchBinary()
}

# Helper variable listing the fitness feature names, reused by the preprocessors below.
list_cols_fitness = ['fitness_experience', 'fitness_education', 'fitness_skills', 'fitness_gender', 'fitness_foreign']
# Display the matching mappings.
maps_matching
[8]:
{(('experience_reqs_duration_j', 'role_duration_months_c'),
  ('fitness_experience',)): MatchOrdinal(),
 (('education_reqs_j', 'education_background_c'),
  ('fitness_education',)): MatchFeatureAtLeastInList(),
 (('skills_j', 'skills_c'), ('fitness_skills',)): MatchFeatureSet(),
 (('gender_j', 'gender_c'), ('fitness_gender',)): MatchBinary(),
 (('agg_perceived_foreign_j', 'agg_perceived_foreign_c'),
  ('fitness_foreign',)): MatchBinary()}
[9]:
# Scikit-learn transformation for numeric and categorical features
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor  # NOTE(review): unused in this notebook — likely a leftover; safe to drop.
from sklearn.compose import ColumnTransformer

numeric_features = list_cols_fitness
categorical_features = ['gender_c', 'agg_perceived_foreign_c']
# imputing and scaling numeric features
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),  # Not needed for the used dataset (no missing values in df_all.info()).
        ("scaler", StandardScaler())  # Not needed for tree-based learners such as the EBM; kept for the sake of generality.
    ]
)
# imputing and encoding categorical features
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Not needed for the used dataset, again for the sake of generality.
        ("encoder", OneHotEncoder()),  # Convert to one-hot encoding
    ]
)
# combining the two above; columns not listed are dropped (ColumnTransformer default remainder='drop')
column_preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        # ("cat", categorical_transformer, categorical_features)  # categorical branch intentionally disabled
    ],
)
[10]:
from findhr.preprocess.mapping import AttachMetadata, DetachMetadata, DerivedColumn

# The pipeline is composed of two phases:
# 1. Preprocessing with metadata (findhr package): attach metadata, compute the
#    derived and matching columns in order, then detach the metadata again.
derivation_steps = [
    ("init", AttachMetadata(md_all)),
    ("mapping_1", DerivedColumn(maps_derived_1)),
    ("mapping_2", DerivedColumn(maps_derived_2)),
    ("matching", DerivedColumn(maps_matching)),
    ("end", DetachMetadata()),
]
pipeline_derived = Pipeline(steps=derivation_steps)
# 2. Standard scikit-learn preprocessing (no metadata) is handled by `column_preprocessor`.

[11]:
## Pipeline Including ExplainableBoostingRegressor
[12]:
# Pipeline definition for the regression model on the target feature "score".
# (The findhr mapping classes are already imported by the pipeline_derived cell;
# the duplicate import was removed.)
from interpret.glassbox import ExplainableBoostingRegressor

pipeline_regr = Pipeline(
    steps=[
        # First phase: metadata-aware preprocessing (findhr package).
        ('fitness_value', pipeline_derived),
        # Second phase: preprocessing without metadata (standard scikit-learn).
        ("column_preprocessor", column_preprocessor),
        # Model inference: glass-box EBM regressor from the interpret package.
        ("regressor", ExplainableBoostingRegressor())
    ]
)
[13]:
# Fit the full pipeline: predictors are all non-id, non-target columns,
# and the regression target is the numeric "score" column.
pipeline_regr.fit(df_all[pred_cols], df_all['score'])
[13]:
Pipeline(steps=[('fitness_value',
                 Pipeline(steps=[('init',
                                  AttachMetadata(metadata_dict={'agg_perceived_foreign_c':
    SCHEMA = {'enum': ['No', 'Yes', 'Any']}
    ATTR_TYPE = category
    ATTR_USAGE = sensitive
    KNOWLEDGE_BASE = None,
                                                                'agg_perceived_foreign_j':
    SCHEMA = {'enum': ['No', 'Yes', 'Any']}
    ATTR_TYPE = category
    ATTR_USAGE = sensitive
    KNOWLEDGE_BASE = None,
                                                                'education_background_c':
    SC...
                                                                                                                           output_cols=('fitness_skills',))})),
                                 ('end', DetachMetadata())])),
                ('column_preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['fitness_experience',
                                                   'fitness_education',
                                                   'fitness_skills',
                                                   'fitness_gender',
                                                   'fitness_foreign'])])),
                ('regressor', ExplainableBoostingRegressor())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
[14]:
# Example model prediction on the first 11 rows (`.loc[:10]` slices by label
# and is inclusive of row 10).
example_rows = df_all.loc[:10, pred_cols]
pipeline_regr.predict(example_rows)
[14]:
array([-1.26254049e-17,  6.00000000e-01, -1.23256533e-17,  4.00000000e-01,
        7.00000000e-01, -1.23256533e-17,  3.50000000e-01, -1.26254049e-17,
       -1.23256533e-17,  7.50000000e-01, -1.23256533e-17])

Example Model Explanation

[15]:
# Get a global explanation (overall term importances) from the fitted
# ExplainableBoostingRegressor inside the pipeline.
explanation_global = pipeline_regr.named_steps['regressor'].explain_global()

[16]:
# Visualize the global explanation by plotting the feature importance.
# Renders an interactive figure inline in the notebook.
explanation_global.visualize()
[17]:
# Run a single explicand through every pipeline step except the final model,
# then ask the EBM for a local (per-prediction) explanation of it.
idx_explicand_sample = 0
explicand_sample = df_all.loc[:, pred_cols].iloc[[idx_explicand_sample]]
transformed_data = pipeline_regr[:-1].transform(explicand_sample)
explanation_local = pipeline_regr.named_steps['regressor'].explain_local(transformed_data)

[18]:
# Visualize the local explanation for the first (and only) sample explained.
# See documentation at https://interpret.ml/docs/ebm.html for further details.
explanation_local.visualize(0)
[ ]: